clear
clear matrix
set matsize 1000
set more off


*this command opens the BSD dataset for the year 2005
use ".........\bsd2005.dta", clear

*this command only keeps active firms
keep if active==1

*the following generate and replace commands create the aggregare industry breakdown compatible with the input-output tables
gen nace2=substr(sic, 1, 2)
gen aggregate_ind=.
replace aggregate_ind=1 if nace2=="01"
replace aggregate_ind=1 if nace2=="02"
replace aggregate_ind=1 if nace2=="05"
replace aggregate_ind=2 if nace2=="10"
replace aggregate_ind=2 if nace2=="11"
replace aggregate_ind=2 if nace2=="13"
replace aggregate_ind=2 if nace2=="14"
replace aggregate_ind=3 if nace2=="15"
replace aggregate_ind=3 if nace2=="16"
replace aggregate_ind=4 if nace2=="17"
replace aggregate_ind=4 if nace2=="18"
replace aggregate_ind=4 if nace2=="19"
replace aggregate_ind=5 if nace2=="20"
replace aggregate_ind=6 if nace2=="21"
replace aggregate_ind=6 if nace2=="22"
replace aggregate_ind=7 if nace2=="23"
replace aggregate_ind=8 if nace2=="24"
replace aggregate_ind=9 if nace2=="25"
replace aggregate_ind=10 if nace2=="26"
replace aggregate_ind=11 if nace2=="27"
replace aggregate_ind=12 if nace2=="28"
replace aggregate_ind=13 if nace2=="29"
replace aggregate_ind=14 if nace2=="30"
replace aggregate_ind=15 if nace2=="31"
replace aggregate_ind=16 if nace2=="32"
replace aggregate_ind=17 if nace2=="33"
replace aggregate_ind=18 if nace2=="34"
replace aggregate_ind=19 if nace2=="35"
replace aggregate_ind=20 if nace2=="36"
replace aggregate_ind=20 if nace2=="37"
replace aggregate_ind=21 if nace2=="40"
replace aggregate_ind=21 if nace2=="41"
replace aggregate_ind=22 if nace2=="45"
replace aggregate_ind=23 if nace2=="50"
replace aggregate_ind=23 if nace2=="51"
replace aggregate_ind=23 if nace2=="52"
replace aggregate_ind=24 if nace2=="55"
replace aggregate_ind=25 if nace2=="60"
replace aggregate_ind=25 if nace2=="61"
replace aggregate_ind=25 if nace2=="62"
replace aggregate_ind=25 if nace2=="63"
replace aggregate_ind=26 if nace2=="64"
replace aggregate_ind=27 if nace2=="65"
replace aggregate_ind=27 if nace2=="66"
replace aggregate_ind=27 if nace2=="67"
replace aggregate_ind=28 if nace2=="70"
replace aggregate_ind=29 if nace2=="71"
replace aggregate_ind=30 if nace2=="72"
replace aggregate_ind=31 if nace2=="73"
replace aggregate_ind=32 if nace2=="74"
replace aggregate_ind=33 if nace2=="75"
replace aggregate_ind=34 if nace2=="80"
replace aggregate_ind=35 if nace2=="85"
replace aggregate_ind=35 if nace2=="90"
replace aggregate_ind=35 if nace2=="91"
replace aggregate_ind=35 if nace2=="92"
replace aggregate_ind=35 if nace2=="93"
replace aggregate_ind=35 if nace2=="95"
replace aggregate_ind=35 if nace2=="98"
replace aggregate_ind=35 if nace2=="99"

*we drop cases that we cannot assign
drop if aggregate_ind==.

*we drop cases with missing or zero employment (number of employees excluding the owners) 
drop if employees==.
drop if employees==0
rename employees empfte


*In what follows the compute and substract "R&D related workers"
sort aggregate_ind
*the file we merge has info on the share of R&D outlays as a % of revenue (from OECD STAN) 
merge aggregate_ind using  ".....\aggregate_ind_R_D_UK.dta"
egen empfte_aggregate_ind=sum(empfte), by(aggregate_ind)
gen pippo=1
egen nfirms_aggregate_ind=sum(pippo), by(aggregate_ind)
gen cut=(empfte_aggregate_ind*(r_and_d_percent/100))/nfirms_aggregate_ind
su cut, de
replace empfte=empfte-cut

*we eliminate firms ending up, after accounting for "R&D related workers", with zero or negative employment 
drop if empfte<=0


*we now trim the data
egen p1_e=pctile(empfte), p(1.5) 
egen p99_e=pctile(empfte), p(98.5) 

drop if empfte<=p1_e
drop if empfte>=p99_e
drop p1_e p99_e


*finally we compute what we need
gen ln_empfte=log(empfte)
egen tot_emp_ind=sum(empfte), by(aggregate_ind)
gen mark_share_2=(empfte/tot_emp_ind)^2
egen double SD_log_emp=sd(ln_empfte), by(aggregate_ind)
egen double Mean_log_emp=mean(ln_empfte), by(aggregate_ind)
gen double dev_mean_4=(ln_empfte-Mean_log_emp)^4
egen double Mean_dev_mean_4=mean(dev_mean_4), by(aggregate_ind)
gen double Sigma_4=SD_log_emp^4

gen n_firms=1

collapse (sum) n_firms mark_share_2 (mean) SD_log_emp Sigma_4 Mean_dev_mean_4, by(aggregate_ind)

save "....\bs_data_all_fin_robust_RD.dta", replace
export excel using ".....\bs_data_all_fin_robust_RD.dta.xls", replace
